/*
 * Copyright 2011 Harald Wellmann
 * Copyright (c) 2002, 2008 Cunningham & Cunningham, Inc.
 *
 * This file is part of reFit.
 * 
 * reFit is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * reFit is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with reFit.  If not, see <http://www.gnu.org/licenses/>.
 */
package fit;

import java.io.*;
import java.text.ParseException;

/**
 * This class implements a partial HTML parser as well as a parse tree node generated by the
 * parser (and thus is a candidate for refactoring).
 * <p>
 * For executing a FIT test, the HTML file containing the test specification is represented as a 
 * tree of tables, rows and cells, so by default, the parser is looking for &lt;table>, &lt;tree> 
 * and &lt;tag> elements as indicated by the {@code tags} member for building the desired tree 
 * structure.
 * <p>
 * Each instance of this class also represents a node of the parse tree, where {@code more}
 * points to the next sibling node and {@code parts} points to the first child node.
 * <p>
 * The {@code leader} and {@code trailer} members contain unparsed text preceding or following
 * the current element.
 * <p>
 * {@code tag} is the opening tag of the current element, including the angular brackets and
 * any attributes.
 * <p>
 * The textual content of an element (if any) is stored in {@code body}.
 *    
 * @author Harald Wellmann
 *
 */
public class Parse {

    public static int footnoteFiles = 0;

    /** Unparsed text preceding this node. */
    public String leader;
    
    /** Opening HTML tag of this node. */
    public String tag;
    
    /** Unparsed content of this node. */
    public String body;
    
    /** Closing HTML tag of this node. */
    public String end;
    
    /** Unparsed text following this node. */
    public String trailer;

    /** Next sibling node (or null). */
    public Parse more;
    
    /** First child node (or null). */
    public Parse parts;

    /**
     * Creates a parse tree node from the given parameters.
     * @param tag
     * @param body
     * @param parts
     * @param more
     */
    public Parse(String tag, String body, Parse parts, Parse more) {
        this.leader = "\n";
        this.tag = "<" + tag + ">";
        this.body = body;
        this.end = "</" + tag + ">";
        this.trailer = "";
        this.parts = parts;
        this.more = more;
    }

    /**
     * The list of tags to be considered by the parse for building the parse tree.
     */
    public static String tags[] = { "table", "tr", "td" };

    /**
     * Creates a parse tree from the given text by parsing it completely.
     * @param text   text to be parsed
     * @throws ParseException
     */
    public Parse(String text) throws ParseException {
        this(text, tags, 0, 0);
    }

    /**
     * Creates a parse tree from the given text by parsing it completely, creating a hierarchy
     * of child nodes according to the given list of tags.
     * @param text   text to be parsed
     * @param tags   Tags corresponding to the tree levels. All nodes at level {@code i} will have
     *               a tag matching {@code tags[i]} (the virtual root node of the tree can be
     *               regarded as having level -1).
     * @throws ParseException
     */
    public Parse(String text, String tags[]) throws ParseException {
        this(text, tags, 0, 0);
    }

    /**
     * Continues building a given level of the parse tree from a given position in a given string
     * which has been partially parsed.
     * @param text   text to be parsed
     * @param tags   Tags corresponding to the tree levels. All nodes at level {@code i} will have
     *               a tag matching {@code tags[i]}.
     * @param level  level of node to be created
     * @param offset current position of the parser in the input string              
     * @throws ParseException
     */
    public Parse(String text, String tags[], int level, int offset) throws ParseException {
        String lc = text.toLowerCase();
        int startTag = lc.indexOf("<" + tags[level]);
        int endTag = lc.indexOf(">", startTag) + 1;
        // int startEnd = lc.indexOf("</"+tags[level], endTag);
        int startEnd = findMatchingEndTag(lc, endTag, tags[level], offset);
        int endEnd = lc.indexOf(">", startEnd) + 1;
        int startMore = lc.indexOf("<" + tags[level], endEnd);
        if (startTag < 0 || endTag < 0 || startEnd < 0 || endEnd < 0) {
            throw new ParseException("Can't find tag: " + tags[level], offset);
        }

        leader = text.substring(0, startTag);
        tag = text.substring(startTag, endTag);
        body = text.substring(endTag, startEnd);
        end = text.substring(startEnd, endEnd);
        trailer = text.substring(endEnd);

        if (level + 1 < tags.length) {
            parts = new Parse(body, tags, level + 1, offset + endTag);
            body = null;
        }
        else { // Check for nested table
            int index = body.indexOf("<" + tags[0]);
            if (index >= 0) {
                parts = new Parse(body, tags, 0, offset + endTag);
                body = "";
            }
        }

        if (startMore >= 0) {
            more = new Parse(trailer, tags, level, offset + endEnd);
            trailer = null;
        }
    }

    /* Added by Rick Mugridge, Feb 2005 */
    
    /**
     * Finds the start position of the matching closing tag.
     * 
     * @param lc        input string converted to lower case
     * @param matchFromHere     start position of the matcher
     * @param tag       tag to be matched
     * @param offset    TODO unclear, only used in exception message 
     */
    protected static int findMatchingEndTag(String lc, int matchFromHere, String tag, int offset)
            throws ParseException {
        int fromHere = matchFromHere;
        int count = 1;
        int startEnd = 0;
        while (count > 0) {
            int embeddedTag = lc.indexOf("<" + tag, fromHere);
            int embeddedTagEnd = lc.indexOf("</" + tag, fromHere);
            // Which one is closer?
            if (embeddedTag < 0 && embeddedTagEnd < 0)
                throw new ParseException("Can't find tag: " + tag, offset);
            if (embeddedTag < 0)
                embeddedTag = Integer.MAX_VALUE;
            if (embeddedTagEnd < 0)
                embeddedTagEnd = Integer.MAX_VALUE;
            if (embeddedTag < embeddedTagEnd) {
                count++;
                startEnd = embeddedTag;
                fromHere = lc.indexOf(">", embeddedTag) + 1;
            }
            else if (embeddedTagEnd < embeddedTag) {
                count--;
                startEnd = embeddedTagEnd;
                fromHere = lc.indexOf(">", embeddedTagEnd) + 1;
            }
        }
        return startEnd;
    }
    
    /**
     * Returns the number of sibling nodes following this node.
     * For the first child node of a given parent, this is equal to the size of the child list.
     * 
     *  @return number of siblings following
     */
    public int size() {
        return more == null ? 1 : more.size() + 1;
    }

    /**
     * Returns the last node of the current sibling list.
     * @return last sibling
     */ 
    public Parse last() {
        return more == null ? this : more.last();
    }

    /**
     * Returns the first leaf descendant of the current node in depth-first traversal.
     * @return first leaf
     */
    public Parse leaf() {
        return parts == null ? this : parts.leaf();
    }

    /**
     * Returns the i-th sibling, (or this node, as default).
     */
    public Parse at(int i) {
        return i == 0 || more == null ? this : more.at(i - 1);
    }

    /**
     * Returns the j-th child of the i-th sibling. 
     */
    public Parse at(int i, int j) {
        return at(i).parts.at(j);
    }

    /**
     * Returns the k-th child of the j-th child of the i-th sibling. 
     */
    public Parse at(int i, int j, int k) {
        return at(i, j).parts.at(k);
    }

    /**
     * Converts the body of this node to plain text by normalizing line breaks and unescaping 
     * HTML entities.
     */ 
    public String text() {
        return htmlToText(body);
    }
    
    /**
     * Converts HTML text to plain text by normalizing line breaks and unescaping HTML entities.
     */ 
    public static String htmlToText(String s) {
        s = normalizeLineBreaks(s);
        s = removeNonBreakTags(s);
        s = condenseWhitespace(s);
        s = unescape(s);
        return s;
    }

    private static String removeNonBreakTags(String s) {
        int i = 0, j;
        while ((i = s.indexOf('<', i)) >= 0) {
            if ((j = s.indexOf('>', i + 1)) > 0) {
                if (!(s.substring(i, j + 1).equals("<br />"))) {
                    s = s.substring(0, i) + s.substring(j + 1);
                }
                else i++;
            }
            else break;
        }
        return s;
    }
    
    /**
     * Replaces HTML entities and breaks by plain unicode equivalents.
     */
    public static String unescape(String s) {
        s = s.replaceAll("<br />", "\n");
        s = unescapeNumericEntities(s);
        s = unescapeCharacterEntities(s);
        s = unescapeSmartQuotes(s);
        return s;
    }

    private static String unescapeSmartQuotes(String s) {
        s = s.replace('\u201c', '"');
        s = s.replace('\u201d', '"');
        s = s.replace('\u2018', '\'');
        s = s.replace('\u2019', '\'');
        return s;
    }

    private static String unescapeCharacterEntities(String s) {
        s = s.replaceAll("&lt;", "<");
        s = s.replaceAll("&gt;", ">");
        s = s.replaceAll("&nbsp;", " ");
        s = s.replaceAll("&quot;", "\"");
        s = s.replaceAll("&amp;", "&");
        return s;
    }

    private static String unescapeNumericEntities(String s) {
        String result = "";
        int lastStart = 0;
        int startsAt = s.indexOf("&#");
        int character;
        while (startsAt >= 0) {
            int endsAt = s.indexOf(';', startsAt);
            if (endsAt < 0) {
                startsAt = s.indexOf("&#", startsAt + 1);
                continue;
            }

            try {
                String entity = s.substring(startsAt + 2, endsAt);
                if (entity.startsWith("x") || entity.startsWith("X")) {
                    character = Integer.parseInt(entity.substring(1), 16);
                }
                else {
                    character = Integer.parseInt(entity, 10);
                }
                if (character <= 0xFFFF) {
                    result += s.substring(lastStart, startsAt) + (char) character;
                    lastStart = endsAt + 1;
                }
            }
            catch (NumberFormatException e) {
                // okay--we'll just loop around again
            }
            finally {
                startsAt = s.indexOf("&#", endsAt);
            }
        }
        result += s.substring(lastStart);

        return result;
    }

    private static String normalizeLineBreaks(String s) {
        s = s.replaceAll("<\\s*br\\s*/?\\s*>", "<br />");
        s = s.replaceAll("<\\s*/\\s*p\\s*>\\s*<\\s*p( .*?)?>", "<br />");
        return s;
    }

    /**
     * Removes all leading and trailing whitespace and replaces any other sequences of whitespace
     * characters or entities by a single blank space.
     * @param s   string to be whitespace-normalized
     * @return  normalized string
     */
    public static String condenseWhitespace(String s) {
        final char NON_BREAKING_SPACE = (char) 160;

        s = s.replaceAll("\\s+", " ");
        s = s.replace(NON_BREAKING_SPACE, ' ');
        s = s.replaceAll("&nbsp;", " ");
        s = s.trim();
        return s;
    }

    /**
     * Inserts the given text into the tag of this node, before the closing bracket. 
     * (Used to add attributes to a tag.) 
     * @param text   text to be inserted
     */
    public void addToTag(String text) {
        int last = tag.length() - 1;
        tag = tag.substring(0, last) + text + ">";
    }

    /**
     * Appends the given text to the body of this node.
     * @param text
     */
    public void addToBody(String text) {
        body = body + text;
    }

    /**
     * Recursively prints the content of this tree to the given stream.
     * @param out output stream
     */
    public void print(PrintWriter out) {
        out.print(leader);
        out.print(tag);
        if (parts != null) {
            parts.print(out);
        }
        else {
            out.print(body);
        }
        out.print(end);
        if (more != null) {
            more.print(out);
        }
        else {
            out.print(trailer);
        }
    }

    /**
     * TODO unclear, from Fit 1.1+.
     * @return
     */
    @Deprecated
    public String footnote() {
        if (footnoteFiles >= 25) {
            return "[-]";
        }
        else {
            try {
                int thisFootnote = ++footnoteFiles;
                String html = "footnotes/" + thisFootnote + ".html";
                File file = new File("Reports/" + html);
                file.delete();
                PrintWriter output = new PrintWriter(new BufferedWriter(new FileWriter(file)));
                print(output);
                output.close();
                return "<a href=/fit/Release/Reports/" + html + "> [" + thisFootnote + "]</a>";
            }
            catch (IOException e) {
                return "[!]";
            }
        }
    }
}
